* Andrii Parkhomenko
* Project: Urban Structure When Working from Home, with Matt Delventhal
* collegeprem_pwpuma_ind.do
* Purpose: construct college wage premia for each PWPUMA and industry
********************************************************************************
********************************************************************************

* ------------------------------------------------------------------------------
* Initial settings, read data, create variables

clear all

cd "D:\Dropbox\My research\Projects\Urban Structure When Working from Home\Data\data_in\Wages"
local dir_use D:\Data for research\IPUMS

use "`dir_use'\\usa_00156.dta"

local incometype incearn

if "`incometype'"=="incearn" {
	gen wageyr = incwage + incbus00
	}
if "`incometype'"=="incwage" {
	gen wageyr = incwage
	}

egen loc_work = concat(pwstate2 pwpuma00), punct("_")

* ------------------------------------------------------------------------------
* Construct hourly wages

gen hours = .
replace hours = uhrswork if uhrswork>0

gen weeks = .
replace weeks = 27+(39-27)/2 if wkswork2==3
replace weeks = 40+(47-40)/2 if wkswork2==4
replace weeks = 48+(49-48)/2 if wkswork2==5
replace weeks = 50+(52-50)/2 if wkswork2==6

gen hoursyr = hours*weeks if hours!=. & weeks!=. 
gen wagehr = wageyr / hoursyr if hours!=. & weeks!=.

* ------------------------------------------------------------------------------
* Clean data for calculating wages

gen labor_smpl = 1

replace labor_smpl = 0 if gq!=1													// drop observations in group quarters
replace labor_smpl = 0 if age<25 | age>64										// drop observations younger than 18 years old
replace labor_smpl = 0 if empstat!=1											// keep observations who report being employed
replace labor_smpl = 0 if wageyr==999999										// drop observations with absent info on wage income
replace labor_smpl = 0 if wageyr<=0 | inctot<=0									// drop observations with negative wage or total income
*replace labor_smpl = 0 if (ind1990>=900 | occ1990>=905 | occ1990==3)  			// drop government employees and military
replace labor_smpl = 0 if uhrswork<35  											// keep those who worked 35 hours or more last week
replace labor_smpl = 0 if wkswork2<3  											// keep those who worked more than 26 weeks last year
replace labor_smpl = 0 if (wagehr<5.15/2 & (multyear==2006 | multyear==2007)) | /// drop those who earn less than 1/2 minimum federal wage
	(wagehr<5.85/2 & multyear==2008) | (wagehr<6.55/2 & multyear==2009) | (wagehr<7.25/2 & multyear>=2010)

keep if labor_smpl==1
keep if year==2016

drop if pwstate2==0 | pwstate2==2 | pwstate2==15 | pwstate2>56 					// keep the 48 contiguous states and D.C.

* ------------------------------------------------------------------------------
* Construct NAICS industries and divide into tradable/non-tradable

gen indnaicsgroup = substr(indnaics,1,2)
replace indnaicsgroup = "33" if indnaics=="3MS"
replace indnaicsgroup = "45" if indnaics=="4M" | indnaics=="4MS"
destring indnaicsgroup, replace

gen industry = .
replace industry = 1 if indnaicsgroup==11 | indnaicsgroup==21 
replace industry = 2 if indnaicsgroup==23
replace industry = 3 if indnaicsgroup==31 | indnaicsgroup==32 | indnaicsgroup==33
replace industry = 4 if indnaicsgroup==42
replace industry = 5 if indnaicsgroup==44 | indnaicsgroup==45 
replace industry = 6 if indnaicsgroup==48 | indnaicsgroup==49 | indnaicsgroup==22 
replace industry = 7 if indnaicsgroup==51
replace industry = 8 if indnaicsgroup==52 | indnaicsgroup==53 
replace industry = 9 if indnaicsgroup==54 | indnaicsgroup==55 | indnaicsgroup==56 
replace industry = 10 if indnaicsgroup==61 | indnaicsgroup==62 
replace industry = 11 if indnaicsgroup==71 | indnaicsgroup==72 
replace industry = 12 if indnaicsgroup==81
replace industry = 13 if indnaicsgroup==92 & ~(ind1990>=940 & ind1990<=960)
replace industry = 14 if indnaicsgroup==92 & (ind1990>=940 & ind1990<=960)

label define industrylab 1 "Agriculture, forestry, fishing and hunting, and mining" ///
						 2 "Construction" ///
						 3 "Manufacturing" ///
						 4 "Wholesale trade" ///						 
						 5 "Retail trade" ///
						 6 "Transportation and warehousing, and utilities" ///
						 7 "Information" ///	
						 8 "Finance, insurance, real estate and rental and leasing" ///
						 9 "Professional, scientific, management, administrative,  and waste management services" ///
						 10 "Educational, health and social services" ///	
						 11 "Arts, entertainment, recreation, accommodation and food services" ///
						 12 "Other services (except public administration)" ///
						 13 "Public administration" ///
						 14 "Armed forces"
label values industry industrylab  

gen indtnt = .  // tradable/non-tradable
replace indtnt = 0 if industry==5 /// NT: "Retail trade"
					| industry==10 /// NT: "Educational, health and social services"
					| industry==11 /// NT: "Arts, entertainment, recreation, accommodation and food services"
					| industry==12 /// NT: "Other services (except public administration)"
					| industry==13 // NT: "Public administration" 	
replace indtnt = 1 if indtnt~=0
replace indtnt = 2 if industry==2  // "Construction"

label define indtntlab 0 "Non-tradable" ///
					   1 "Tradable" ///
					   2 "Construction"
label values indtnt indtntlab 

drop if industry==14
						 
* ------------------------------------------------------------------------------
* Create control variables

* Race and sex
gen male = 0
replace male = 1 if sex==1
gen white = 0
replace white = 1 if race==1 & hispan==0

* Education 
gen college = 0 
replace college = 1 if educ>=10

* Age
gen agegroup = .
replace agegroup = 22 if age<25
replace agegroup = 27 if age>=25 & age<30
replace agegroup = 32 if age>=30 & age<35
replace agegroup = 37 if age>=35 & age<40
replace agegroup = 42 if age>=40 & age<45
replace agegroup = 47 if age>=45 & age<50
replace agegroup = 52 if age>=50 & age<55
replace agegroup = 57 if age>=55 & age<60
replace agegroup = 62 if age>=60
qui tab agegroup, gen(Dage)
drop Dage1

* Occupation
forvalues di = 1/16 {  // occupation dummies; see https://usa.ipums.org/usa-action/variables/OCC1990#codes_section
	gen Docc`di' = 0
	}
replace Docc1 = 1 if (occ1990>=3 & occ1990<=22)     // Executive, Administrative, and Managerial Occupations
replace Docc2 = 1 if (occ1990>=23 & occ1990<=37)    // Management Related Occupations
replace Docc3 = 1 if (occ1990>=43 & occ1990<=200)   // Professional Specialty Occupations
replace Docc4 = 1 if (occ1990>=203 & occ1990<=235)  // Technicians and Related Support Occupations
replace Docc5 = 1 if (occ1990>=243 & occ1990<=283)  // Sales Occupations
replace Docc6 = 1 if (occ1990>=303 & occ1990<=389)  // Administrative Support Occupations, Including Clerical Supervisors, Administrative Support Occupations
replace Docc7 = 1 if (occ1990>=405 & occ1990<=407)  // Private Household Occupations
replace Docc8 = 1 if (occ1990>=415 & occ1990<=427)  // Protective Service Occupations
replace Docc9 = 1 if (occ1990>=434 & occ1990<=469)  // Service Occupations, Except Protective and Household Food Preparation and Service Occupations
replace Docc10 = 1 if (occ1990>=473 & occ1990<=498) // FARMING, FORESTRY, AND FISHING OCCUPATIONS
replace Docc11 = 1 if (occ1990>=503 & occ1990<=549) // Mechanics and Repairers
replace Docc12 = 1 if (occ1990>=558 & occ1990<=599) // Construction Trades
replace Docc13 = 1 if (occ1990>=614 & occ1990<=617) // Extractive Occupations
replace Docc14 = 1 if (occ1990>=628 & occ1990<=699) // Precision Production Occupations
replace Docc15 = 1 if (occ1990>=703 & occ1990<=799) // Machine Operators, Assemblers, and Inspectors
replace Docc16 = 1 if (occ1990>=803 & occ1990<=889) // Transportation and Material Moving Occupations

* ------------------------------------------------------------------------------
* Estimate college wage premia

gen logwageyr = log(wageyr)
gen logwagehr = log(wagehr)

gen collegeprem = .
gen collegeprem_se = .
gen indtntprem = .
gen indtntprem_se = .
gen nobs = .
gen sNL = .
gen sTL = .

egen locgroup = group(pwstate2 pwpuma00)
levelsof locgroup
global locgroup_list `r(levels)'

foreach j in $locgroup_list {

	/*reg logwageyr college male white Docc* i.industry Dage* i.multyear [w=perwt] if locgroup==`j', robust
	
	replace collegeprem = _b[college] if locgroup==`j'
	replace collegeprem_se = _se[college] if locgroup==`j'
	
	reg logwageyr indtnt college male white Docc* /*i.industry*/ Dage* i.multyear [w=perwt] if locgroup==`j', robust
	
	replace indtntprem = _b[indtnt] if locgroup==`j'
	replace indtntprem_se = _se[indtnt] if locgroup==`j'
	replace nobs = `e(N)' if locgroup==`j'*/
	
	reg logwageyr college indtnt male white Docc* Dage* i.multyear [w=perwt] if locgroup==`j', robust
	
	replace collegeprem = _b[college] if locgroup==`j'
	replace collegeprem_se = _se[college] if locgroup==`j'
	replace indtntprem = _b[indtnt] if locgroup==`j'
	replace indtntprem_se = _se[indtnt] if locgroup==`j'
	replace nobs = `e(N)' if locgroup==`j'
	
	sum perwt if locgroup==`j' & indtnt==0
	local sumperwtN = `r(sum)'
	sum perwt if locgroup==`j' & indtnt==1
	local sumperwtT = `r(sum)'
	sum perwt if college==0 & indtnt==0 & locgroup==`j'
	replace sNL = `r(sum)'/`sumperwtN' if locgroup==`j'
	sum perwt if college==0 & indtnt==1 & locgroup==`j'
	replace sTL = `r(sum)'/`sumperwtT' if locgroup==`j'
		
	}
	
save reg_out_temp, replace

* ------------------------------------------------------------------------------
* Clean, organize and save

gen fobs = 0
bys loc_work: replace fobs = 1 if _n==1
*bys loc_work indtnt: replace fobs = 1 if _n==1
keep if fobs==1

rename pwstate2 statename
gen statecode = statename

keep loc_work collegeprem* nobs statename statecode pwpuma00 year indtntprem* sNL sTL
order year statename statecode pwpuma00 loc_work collegeprem* indtntprem* sNL sTL nobs
sort year statecode pwpuma00 collegeprem* indtntprem* nobs

* Save at POWPUMA level
save collegeprem_wpuma_indtnt, replace
export delimited using collegeprem_wpuma_indtnt.csv, replace

* Save at model-location level
gen powstate = statecode
gen powpuma = pwpuma00
merge 1:m powstate powpuma using "D:\Dropbox\My research\Projects\Urban Structure When Working from Home\Data\data_in\Crosswalks\puma_powpuma"
keep if _merge==3
drop _merge

gen pumacode = statecode*100000 + puma
merge 1:m pumacode using "D:\Dropbox\My research\Projects\Urban Structure When Working from Home\Data\data_in\Crosswalks\tract_modelloc_crosswalk"
collapse (mean) collegeprem indtntprem sNL sTL, by(loccode)
save collegeprem_modelloc, replace
export delimited using collegeindtntprem_modelloc.csv, replace
